# -*- coding: utf-8 -*-
"""
Created on Mon Apr  1 15:33:45 2019

@author: Derek
"""
# Scrape the top 100 movies from the Maoyan movie board with Python

import os
import re
import time

import pandas as pd
import requests

def get_one_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole run.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        for any other status code or when the request itself fails
        (connection error, timeout, ...).
    """
    # Browser-like User-Agent, presumably so the site does not reject the
    # request as coming from a script. Built from adjacent literals so that
    # no source indentation leaks into the header value.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/73.0.3683.86 Safari/537.36'
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Keep the "None means the page is unavailable" contract for
        # network-level failures as well as for bad status codes.
        return None

    if response.status_code == 200:
        return response.text
    return None


def regrex(html, csv_path="C:\\Users\\13917\\Desktop\\b.csv"):
    """Extract the movie entries from one board page and append them to a CSV.

    Each field (rank, title, cast line, release line, poster URL, score) is
    collected with its own regular expression over the whole page; the
    resulting lists are combined column-wise into a DataFrame indexed by
    rank and appended to *csv_path*.

    Args:
        html: Page source as a string. ``None`` or an empty string is
            ignored and nothing is written.
        csv_path: File the rows are appended to. Defaults to the path the
            script has always used.

    Raises:
        ValueError: Raised by pandas when the patterns do not all match the
            same number of times, since the columns would differ in length.
    """
    if not html:
        # Nothing to parse (e.g. the download failed); re.findall would
        # raise TypeError on None.
        return

    # re.S lets '.' cross line breaks, as every entry spans several lines.
    rank = re.findall(r'<dd>.*?.*?>(.*?)</i>', html, re.S)
    movie = re.findall(r'class="name".*?title="(.*?)"', html, re.S)
    star = re.findall(r'class="star">\s+(.*?)</p>', html, re.S)
    release_time = re.findall(r'releasetime">(.*?)</p>', html, re.S)
    img_url = re.findall(r'alt.*?src="(.*?)"\s+alt', html, re.S)
    score = re.findall(
        r'"score".*?r">(.*?\.).*?class="fraction">(.*?)</i>', html, re.S
    )
    # The score is matched as two pieces ("9." and "5"); join them.
    scores = [integer_part + fraction_part
              for integer_part, fraction_part in score]

    data = pd.DataFrame(
        {
            'ranking': rank,
            'movie_name': movie,
            'actor': star,
            'time': release_time,
            'score': scores,
            'img_url': img_url,
        },
        index=rank,
    )

    # Emit the header only when starting a new (or empty) file, so repeated
    # appends do not scatter header lines between the data rows.
    write_header = (not os.path.exists(csv_path)
                    or os.path.getsize(csv_path) == 0)
    data.to_csv(csv_path, encoding='utf_8_sig', mode='a', header=write_header)
   


def main(offset):
    """Fetch the board page at *offset* and hand its HTML to ``regrex``.

    Args:
        offset: Value placed in the ``offset`` query parameter of the board
            URL (the script calls this with 0, 10, ..., 90).

    A page that cannot be downloaded is reported and skipped instead of
    being passed on as ``None``.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page signals failure with None; parsing it would raise.
        print('Skipping offset {}: page could not be downloaded'.format(offset))
        return
    regrex(html)


if __name__ == '__main__':
    # Walk the board in steps of ten (offsets 0, 10, ..., 90), pausing one
    # second after each page.
    for offset in range(0, 100, 10):
        main(offset=offset)
        time.sleep(1)

    # Re-read the accumulated file, discard every row that has a missing
    # value (this includes any header line found mid-file, whose leading
    # index cell is empty), and write the result back in place.
    csv_path = "C:\\Users\\13917\\Desktop\\b.csv"
    table = pd.read_csv(csv_path)
    table = table.dropna(axis=0, how='any')
    table.to_csv(csv_path, encoding='utf_8_sig')

